INTRODUCTION

leaguedf <- read_csv('../data_sets/S13LeagueOfLegendsData.csv', 
                      col_types=c('c', 'c', 'c', 'c', 'c', 'd', 'd', 'd', 'd', 'd', 'd', 'd', 'c'), 
                      col_names=c('rowno', 'Name', 'Class', 'Role', 'Tier', 'Score', 'Trend', "WinRate", "RoleRate", "PickRate", "BanRate", 'KDA', 'Patch'), skip=1) %>%
  column_to_rownames("rowno") %>% 
  mutate(PickBanRate = PickRate + BanRate, 
         Patch = as.numeric(str_replace(Patch, '(.*?)_(.*?)', '')), 
         Role = str_to_title(Role))
leaguedf$Tier = as.factor(leaguedf$Tier) %>%
  fct_relevel(c("God", "S", "A", "B", "C", "D"))

head(leaguedf, 5)
##     Name    Class Role Tier Score  Trend WinRate RoleRate PickRate BanRate  KDA
## 1 Aatrox  Fighter  Top    S 57.63 -31.86  0.4768   0.9163   0.0662  0.1198 1.77
## 2   Ahri     Mage  Mid    S 57.18   4.55  0.4950   0.9465   0.0581  0.0173 2.58
## 3  Akali Assassin  Mid    S 65.49   4.33  0.4841   0.7574   0.0811  0.1302 2.37
## 4  Akali Assassin  Top    C 39.63  -1.51  0.4592   0.2350   0.0255  0.1302 2.00
## 5 Akshan Marksman  Mid    A 49.39   0.34  0.5162   0.6603   0.0275  0.0379 2.26
##   Patch PickBanRate
## 1     1      0.1860
## 2     1      0.0754
## 3     1      0.2113
## 4     1      0.1557
## 5     1      0.0654

AFTER WORKING WITH THE DATA AND DISCUSSING THE INFORMATION WITH YOUR GROUP, YOU SHOULD DESCRIBE 2 QUESTIONS THAT ARE CREATIVE AND INNOVATIVE. YOU SHOULD EXPLAIN WHY THESE QUESTIONS ARE INTERESTING AND WHY THEY DESERVE FURTHER INVESTIGATION. I ADVISE TO THINK OF REASONS WHY AN OWNER OF THE DATA MIGHT BENEFIT FROM ANSWERS TO THESE QUESTIONS. THINK OF REASONS WHY THE WORLD MAY BE INTERESTED IN THESE QUESITONS. THE PURPOSE OF THE INTRODUCTION IS TO STATE SOME INTERESTING QUESTIONS AND DEFEND THE VALUE OF THESE QUESTIONS. THIS INTRODUCTION SHOULD BE WRITTEN IN A WAY THAT SHOULD GET THE READER EXCITED ABOUT SEEING YOUR RESULTS. THIS SHOULD BE WRITTEN IN NO MORE THAN 4 PARAGRAPHS.

DATA

tempchamps <- leaguedf %>%
  complete(nesting(Name, Role), Patch) %>% # This explicitly finds champions who were only played in a role significantly for less than all of the patches!
  filter(!complete.cases(.)) %>%
  count(Name, Role)

tempdf <- leaguedf %>%
  filter(Name %in% tempchamps$Name & complete.cases(.)) %>%
  group_by(Name, Role) %>%
  summarize(
    n = n(),
    invN = 1/n(),
    meanWinRate = mean(WinRate),
    sdWinRate = sd(WinRate),
    meanPickBan = mean(PickBanRate),
    sdPickBan = sd(PickBanRate), 
    label = paste(Name, '\n', Role, sep = " ")
  ) %>%
  filter(n != 23)

p1 <- tempdf %>%
  ggplot() + geom_point(mapping = aes(x = meanWinRate, y = meanPickBan, alpha = invN), stroke = 0) +
  geom_vline(xintercept = .50, color = 'red') + 
  coord_trans(x = 'log10', y = 'log10') + 
  labs(x = "Win Rate Average", y = "Pick Ban Rate Average", title = "Win Rate vs Pick Ban Rate for Temporary champions") + 
  theme(legend.position = "none")
p2 <- tempdf %>% select(Name, Role, n) %>% distinct(Name, Role, n) %>%
  ggplot() + 
  geom_histogram(mapping = aes(x = n)) + 
  labs(x = "Patches Present", y = "Count", title = "Distribution of Temporary Champions")
p1 + p2

leaguedf %>%
  select("Name", "PickBanRate", "WinRate", "Role", "RoleRate", "Class", "Patch") %>%
  filter(!(Class == "NULL")) %>%
  group_by(Role) %>%
  group_map( ~ plot_ly(data = .,
      x = ~ PickBanRate,
      y = ~ WinRate,
      color = ~ Class,
      text = ~ Name,
      frame = ~ Patch, 
      hoverinfo = "text",
      type = "scatter",
      mode = "markers", 
      marker = list(size = ~ RoleRate*5)
      ), .keep = TRUE) %>%
  subplot(nrows = 2, shareX = TRUE, shareY=TRUE, margin=0.03) %>%
  layout(showlegend = FALSE, title = 'Pick Ban Rate vs. Win Rate by Patch seperated by Role',
         plot_bgcolor='#e5ecf6', 
         xaxis = list( 
           zerolinecolor = '#ffff', 
           zerolinewidth = 2, 
           gridcolor = 'ffff'), 
         yaxis = list( 
           zerolinecolor = '#ffff', 
           zerolinewidth = 2, 
           gridcolor = 'ffff'),
         margin = 0.07) %>%
  layout(annotations = annotations)
PbrCorrelation <- MakeCorrelationDf("PickBanRate", "Pbr") %>%
  group_by(Champion2) %>%
  arrange(PbrCorrelation) %>%
  mutate(label = case_when(
    row_number() <= 2 ~ str_to_title(str_replace(Champion1, '\\.', ' ')),
    row_number() > n() - 2 ~ str_to_title(str_replace(Champion1, '\\.', ' ')), # This adds a Space into the name where the . is  and uncapitalizes the second role
    Champion2 == "Tahm Kench.Support" & PbrCorrelation > 0.68 ~ "Senna Support", # This is an outlier so labeling is justified, especially since it helps show the part of the plot
    TRUE ~ as.character(NA)
  )) 

PbrCorrelation %>% filter(Champion2 %in% c("Tahm Kench Support", "Senna Support", "Ashe Adc")) %>% 
  ggplot(mapping = aes(x=Champion2, y = PbrCorrelation)) + 
  geom_boxplot() + 
  ggtitle("PBR Correlation Boxplot")+ 
  scale_x_discrete(labels = c("Ashe Adc", "Senna Support", "Tahm Kench Support")) + 
  labs(x = "", y = "Pick Ban Rate Correlation Coefficient", caption = "Minimum and Maximum corelation coefficients are annotated, as well as Senna Support for Tahm Kench Support in order\n to best visualize how the strength of certain counters, replacements, and synergies effect Pick Ban Rate.")  + 
  geom_text(aes(label = label), na.rm = TRUE, hjust = -0.1, size = 3, check_overlap = T) 

leaguedf %>% 
  group_by(Name) %>% 
  summarise(Mean_pick=mean(PickRate, na.rm = TRUE), Std_pick=sd(PickRate, na.rm=TRUE), Mean_win=mean(WinRate, na.rm = TRUE), Std_win=sd(WinRate, na.rm=TRUE)) %>% 
  arrange(desc(Mean_pick)) %>% 
  ggplot(aes(Mean_pick, Std_win)) + geom_point() + labs(x = "Mean Pick Rate", y = "Standard Deviation Win Rate") + 
  coord_trans(x = 'log10', y = 'log10') + 
  geom_smooth(aes(x = Mean_pick, y = Std_win), method = 'lm', se = F) # THIS LOOKS NON LINEAR BUT IT IS LINEAR, ITS JUST ON A LOG SCALE!!!!
## `geom_smooth()` using formula = 'y ~ x'

plot1 <- leaguedf %>%
  filter(Name %in% c("Fiora", "Darius", "Garen", "Aatrox", "Jax"), Role == "Top") %>%
  ggplot() + geom_count(aes(x = as.factor(Patch), y = Name, size = PickRate, color = Name)) + labs(x = "Patch", y = "Name", title = "Pick Rate")

plot2 <- leaguedf %>%
  filter(Name %in% c("Fiora", "Darius", "Garen", "Aatrox", "Jax"), Role == "Top") %>%
  ggplot() + geom_count(aes(x = as.factor(Patch), y = Name, size = BanRate, color = Name)) + labs(x = "Patch", y = "Name", title = "Ban Rate")

plot3 <- leaguedf %>%
  filter(Name %in% c("Fiora", "Darius", "Garen", "Aatrox", "Jax"), Role == "Top") %>%
  ggplot() + geom_count(aes(x = as.factor(Patch), y = Name, size = WinRate, color = Name)) + labs(x = "Patch", y = "Name", title = "Win Rate")

(plot1 / plot2/ plot3) + plot_annotation(title = "Analysis of Staple Top Champions")

IN LESS THAN 6 PARAGRAPHS, YOU SHOULD DESCRIBE THE DATA USED TO ANSWER THE QUESTIONS. YOU SHOULD EXPLAIN WHERE THE DATA ORIGINATED. FOR EXAMPLE, IT IS GOOD TO KNOW WHO COLLECTED THE DATA. JUST BECAUSE THE DATA CAME FROM KAGGLE, DOESN’T MEAN KAGGLE.COM COLLECTED THE DATA. GIVE AN IN-DEPTH DESCRIPTION OF THE SPECIFIC VARIABLES IN THE DATA REQUIRED TO ANSWER YOUR QUESTIONS. YOU SHOULDN’T DISCUSS ALL VARIABLES IN THE DATA IF YOU DIDN’T USE ALL VARIABLES IN THE DATA. YOU SHOULD EXPLAIN WHAT EACH OBSERVATION REPRESENTS (I.E. PEOPLE, SCHOOLS, STATES, CITIES, PATIENTS FROM A SPECIFIC HOSPITAL). WHAT IS THIS A SAMPLE OF? HOW MANY OBSERVATIONS DO YOU HAVE? AFTER READING THIS SECTION, THE READER SHOULD CLEARLY UNDERSTAND THE SOURCE AND CONTENT OF THE DATA YOU PLAN ON UTILIZING TO ANSWER YOUR QUESTIONS THAT YOU PROPOSED IN THE INTRODUCTION. AT LEAST ONE, DESCRIPTIVE TABLE AND AT LEAST ONE FIGURE SHOULD BE USED HERE TO HELP THE READER UNDERSTAND WHAT THE DATA LOOKS LIKE WITHOUT SEEING THE ENTIRE DATASET. IN ALL FIGURES AND TABLES, ONLY THE VARIABLES OF INTEREST SHOULD BE USED.

RESULTS

Clustering Analysis

#Cluster Analysis with K-Means


#Step 1: Normalize Data:
#First drop icky Vars and then Dummy encode Name, Class, and Role
#This is a high dimensional Data set

Normaldf <- leaguedf %>%
  select(-c(Tier, Score, Trend, PickRate, BanRate)) %>%
    pivot_wider(names_from = Role,
              values_from = Role,
              values_fn = function(x) 1,
              values_fill = 0) %>%
    mutate(Class = paste("Class: ", Class, sep = '')) %>%
    pivot_wider(names_from = Class,
                values_from = Class,
                values_fn = function(x) 1,
                values_fill = 0) %>%
    pivot_wider(names_from = Name,
                values_from = Name,
                values_fn = function(x) 1,
                values_fill =0) %>%
  mutate(
    WinRate = (WinRate - mean(WinRate))/sd(WinRate),
    RoleRate = (RoleRate - mean(RoleRate))/ sd(RoleRate),
    PickBanRate = (PickBanRate - mean(PickBanRate)) / sd(PickBanRate),
    KDA = (KDA - mean(KDA)) / sd(KDA),
    Patch = (Patch -mean(Patch)) / sd(Patch)
  )
  
#Step 2: Clusterize the Data

data <- kmeans(Normaldf, centers = 6, nstart = 25)

leaguedf$Cluster = as.factor(data$cluster)

#Reproducibility for Graphing purposes

ordering <- leaguedf %>%
  group_by(Cluster) %>%
  summarize(RoleRate = mean(RoleRate)) %>%
  arrange(RoleRate) %>%
  mutate(transformation = row_number())

transform <- function (x) {
  temp <- ordering %>%
    filter(Cluster == x)
  return (temp[[1, 3]])
}

leaguedf$Cluster <- sapply(leaguedf$Cluster, transform)

leaguedf <- leaguedf %>%
  mutate(Cluster = as.factor(Cluster))

plot1a <- leaguedf %>%
  ggplot() + 
  geom_point(mapping = aes(x = KDA, y = WinRate, color = Cluster), size = 0.75, alpha = 0.4) + 
  labs(x = "KDA", y = "Win Rate") + 
  theme_minimal()+ 
  theme(legend.position = "none") + 
  scale_color_manual(values = KMeansPalette)

plot1b <- leaguedf %>%
  ggplot() +
  geom_point(mapping = aes(x = PickBanRate, y = WinRate, color = Cluster), size = 0.75, alpha = 0.4) + 
  labs(x= "Pick/Ban Rate", y= "") + 
  theme_minimal()+ 
  scale_color_manual(values = KMeansPalette)+ 
  theme(legend.position = "bottom")+
     guides(color = guide_legend(override.aes = list(size = 3) ) )

plot1c <- leaguedf %>%
  ggplot() + 
  geom_boxplot(mapping = aes(x = Role, y = RoleRate, color = Cluster), lwd = 0.5) +
    labs(x = "Role", y = "Role %") + 
  theme_minimal() + 
  scale_color_manual(values = KMeansPalette)+ 
  theme(legend.position = "none")
design <- "
12
12
12
12
33
33
33
33
44"

KMeans <- wrap_elements(plot1a + plot1b + plot1c + guide_area() + 
  plot_layout(design = design, guides = "collect") &
  plot_annotation(title = "K Means"))


plot1a <- leaguedf %>%
  ggplot() + 
  geom_point(mapping = aes(x = KDA, y = WinRate, color = Tier), size = 0.75, alpha = 0.4) + 
  labs(x = "KDA", y = "Win Rate") + 
  theme_minimal()+ 
  scale_color_manual(values = TierPalette)+ 
  theme(legend.position = "none")

plot1b <- leaguedf %>%
  ggplot() +
  geom_point(mapping = aes(x = PickBanRate, y = WinRate, color = Tier), size = 0.75, alpha = 0.4) + 
  labs(x= "Pick/Ban Rate", y= "") + 
  theme_minimal()+ 
  scale_color_manual(values = TierPalette)+ 
  theme(legend.position = "bottom")+
     guides(color = guide_legend(override.aes = list(size = 3)))

plot1c <- leaguedf %>%
  ggplot() + 
  geom_boxplot(mapping = aes(x = Role, y = RoleRate, color = Tier), lwd= 0.5) + 
  labs(x = "Role", y = "Role %") + 
  scale_color_manual(values = TierPalette)+ 
  theme_minimal() + 
  theme(legend.position = "none")


Meta_Tiers <- wrap_elements(plot1a + plot1b + plot1c + guide_area() + 
  plot_layout(design = design, guides = "collect") &
  plot_annotation(title = "Meta SRC Tier"))


(KMeans | Meta_Tiers) & plot_annotation(title = "Cluster Analysis") & 
  theme(plot.title = element_text(hjust = 0.5, size = 15, face = 'bold')) 

#Hierarchical Clustering
HCluster <- hclust(dist(Normaldf))

plot(HCluster, xlab = '', sub = '', cex = .9) #Dendrogram!!!

leaguedf$HClust <- as.factor(cutree(HCluster, 5))

plot1a <- leaguedf %>%
  ggplot() + 
  geom_point(mapping = aes(x = KDA, y = WinRate, color = HClust), size = 0.6, alpha = 0.8) + 
  labs(x = "KDA", y = "Win Rate") + 
  theme_minimal()+ 
  theme(legend.position = "none")

plot1b <- leaguedf %>%
  ggplot() +
  geom_point(mapping = aes(x = PickBanRate, y = WinRate, color = HClust), size = 0.6, alpha = 0.8) + 
  labs(x= "Pick/Ban Rate", y= "Win Rate") + 
  theme_minimal()+ 
  theme(legend.position = "right")+
     guides(color = guide_legend(override.aes = list(size = 3) ) )

plot1c <- leaguedf %>%
  ggplot() + 
  geom_boxplot(mapping = aes(x = Role, y = RoleRate, color = HClust), lwd = 0.5) +
    labs(x = "Role", y = "Role %") + 
  theme_minimal() + 
  theme(legend.position = "none")



((plot1a | plot1b) / plot1c )& 
  plot_layout(guides = "collect") &
  plot_annotation(title = "Hierarchical Cluster Analysis")  & theme(plot.title = element_text(hjust = 0.5, size = 15, face = 'bold'))

leaguedf %>%
  rename(`Hierarchical Cluster` = HClust) %>%
  group_by(`Hierarchical Cluster`) %>%
  summarize(
            `Mean Win Rate` = mean(WinRate),
            `Mean PB Rate` = mean(PickBanRate),
            `Mean Role %` = mean(RoleRate),
            `Mean KDA` = mean(KDA),
            `Median Patch` = median(Patch),
            `Number of Champs` = n_distinct(Name)) %>%
  kbl() %>%
  kable_classic(full_width = F, html_font = "Times New Roman")
Hierarchical Cluster Mean Win Rate Mean PB Rate Mean Role % Mean KDA Median Patch Number of Champs
1 0.4963903 0.0988015 0.5624936 2.089316 11.0 130
2 0.5124245 0.0760563 0.6885434 2.536914 15.0 140
3 0.4449148 0.1436213 0.2178475 2.146066 8.0 12
4 0.5089792 0.4172494 0.8719532 2.705065 11.0 13
5 0.4639125 0.1569875 0.9865125 4.143750 16.5 1
  • Temporary Names for groups:
    • 1: Below Average win rate champs, possibly overpicked
    • 2: Above Average win rate champs, generally underpicked
    • 3: very low win rate champs
    • 4: highly picked champs
    • 5: Yuumi <3

Part 2: Classification Exploration

PCAdf <- leaguedf %>%
  mutate(Class = paste("Class", Class, sep='_')) %>%
  pivot_wider(
    names_from = "Role",
    values_from = "Role",
    values_fn = function (x) 1,
    values_fill = 0
  ) %>%
  pivot_wider(
    names_from = "Class",
    values_from = "Class",
    values_fn = function (x) 1,
    values_fill = 0
  ) %>%
  select(-c(Score, Trend, Tier, Cluster, HClust, Patch, Name)) #i'm taking Patch out since it basically messes with the scaling. Patch is uniform, so variance is maximized when patch is maximized

PCA1 <- prcomp(PCAdf, center = T, scale = F)
eigensum <- function(eigen) {
  answer <- vector(length = length(eigen)) 
  sum = 0
  for (i in 1:length(eigen)) {
    sum = sum + eigen[i]
    answer[i] = sum
  }
  return(answer)
}
eigen <- PCA1$sdev^2

eigFrame <- data.frame(dim = factor(1:length(eigen)), eig = eigen, sum = eigensum(eigen))



Principles <- PCA1$rotation %>%
  dimnames

PCACoef <- PCA1$rotation %>%
  as_tibble() %>%
  select(c(1,2,3))

eigFrame %>%
  ggplot() + 
  geom_bar(mapping = aes(y = eig, x = dim, fill = 'green'), stat = 'identity') + 
  geom_bar(mapping = aes(y = sum, x = dim, fill = 'blue'), stat = 'identity', alpha = 0.3, ) + 
  scale_fill_manual(values = c("green" = "green", "blue" = "blue"), labels = c( "Rolling Summation", "Eigenvalue")) + 
  labs(fill = "Value",x = "Dimension Number", y = "Eigenvalue") + 
  theme(legend.position = c(0.15, 0.95),
        legend.justification = "top")
## Warning: A numeric `legend.position` argument in `theme()` was deprecated in ggplot2
## 3.5.0.
## ℹ Please use the `legend.position.inside` argument of `theme()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Based on the PCA Analysis, we can maybe see that we want 7-9 variables, but we don’t know. Please ask!!!!

Tiers <- unique(leaguedf$Tier)

Stepdf <- leaguedf %>%
  mutate(Class = paste("Class", Class, sep='_')) %>%
  pivot_wider(
    names_from = "Role",
    values_from = "Role",
    values_fn = function (x) 1,
    values_fill = 0
  ) %>%
  pivot_wider(
    names_from = "Class",
    values_from = "Class",
    values_fn = function (x) 1,
    values_fill = 0
  ) %>%
  select(-c(Score, Trend, Patch, Name, HClust, Cluster))
stepResults <- tibble(Tier = c('0'), results = c(list()), .rows = 0)
tiers <- unique(leaguedf$Tier)

for (tier in tiers) {
  tempStep <- Stepdf %>%
    mutate(Tier = Tier == tier)
  
  stepModel <- lm(Tier ~ ., data = tempStep)
    
  stepModel <- step(stepModel, direction = "both", trace=0)
    
  stepResults <- stepResults %>%
    add_row(Tier = tier, results = list(names(summary(stepModel)$aliased)[-1]))
}
plist = vector('list', length = length(tiers))
counter = 1
for (tier in tiers) {
  testVars <- stepResults %>% filter(Tier == tier) %>% select(results)
  testVars <- testVars$results[[1]]
  
  rSquaredFrame <- tibble(features = 1:length(testVars), rSquared = 0)
  tempStep <- Stepdf %>%
    mutate(Tier = Tier == tier)
  varlist <- c()
  for (var in testVars) {
    varlist <- append(varlist, var)
    tempStep2 <- select(tempStep, varlist, Tier)
    model <- lm(Tier ~ ., data = tempStep2)  
    
    rSquaredFrame[length(varlist), 2] = BIC(model)
  }
  
  p1 <- rSquaredFrame %>%
    ggplot(mapping = aes(x = features, y = rSquared)) + 
    geom_point(shape = 4, color = "red")  + 
    geom_line() + 
    geom_vline(xintercept = 7, color = 'blue') + 
    labs(x = "Number of Features", y= "BIC") + 
    theme_minimal()
  plist[[counter]] <- p1
  counter = counter + 1
}
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(varlist)
## 
##   # Now:
##   data %>% select(all_of(varlist))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
plot <- Reduce('+', plist)
plot + plot_annotation(title = "BIC Analysis with Bi-Directional Stepwise Regression faceted by Tier", subtitle = "Verticle line at x=7")

We can see that around \(7\) feature for each tier is optimal or near optimal, so we want to find 7 variables to use. To do this, we use a Boruta model, which can naturally deal with high dimensional categorical variables in an efficient way for multi-class regressions. A boruta model generates a decision tree in a specific manner and sees how inclusion and emphasis of different variables differently affects the accuracy of a random forest model. Because of this, it is very good at feature selection for Random Forest models, of which the Ranger model which we use is a subset of. Beyond that, it measures importance by gathering Z scores of mean decrease accuracy measure (DAM).

#Feature Selection

boruta <- Boruta(Tier ~ ., data = select(leaguedf, -c(Score, Trend, Cluster, HClust, PickBanRate)))

plot(boruta, las = 2, cex.axis = 0.7)

We wamt 7 features, and knowing that Role woudl account for 5 in the stepwise regressoin, we can see that WinRate, PickRate, Role, and BanRate are the 4 most likely to be influential variables, and they would make up 8 total features, which is only slightly above what we wanted.

We choose K-Fold over LOOCV for cross validation because 1) K-Fold requires much less computation power and time and this already took 5 hours, and 2) K-Fold can better estimate model accuracy’s for machine learning by reducing bias variance, i.e. training on a smaller set will result in less overfitting. K-Fold cross validation functions as: Partition the dataset \(S\) into \(k\) sets, \(S_1, S_2, \ldots, S_k\), loop over \(1\) to \(k\) with \(i\) as the counter, next train each model on the data set \(S - S_i\), then test each model on the sample \(S_i\) and record the accuracy. When this is finished we compare the mean of each accuracy to best understand the overall standard deviation.

featurecombs <- powerSet(c('WinRate', 'PickRate', 'Role', 'BanRate'))# 2^4 = 16 so 16 samples, for 3 models each with 20 runs gives a total of 960 trained models.
featurecombs <- featurecombs[-1]

leaguedf <- leaguedf %>% select(-c(PickBanRate, HClust, Cluster, Score, Trend)) 
leaguedf$sample <- sample(1:nrow(leaguedf), nrow(leaguedf))

b = nrow(leaguedf)/10

leaguedf <- leaguedf %>%
  mutate(sample = ceiling(sample/b))


crossdf <- data.frame(sample = rep(1:10, each = 15), svm_linear = 0, svm_radial = 0, ranger = 0, WinRate = 0, PickRate = 0, Role = 0, BanRate = 0)



fitControl <- trainControl(method='CV', 
                           number = 3, 
                           verboseIter=F)
#Using a function here makes sure each model stays local and so we end up with a faster training. Consider using this idea elsewhere in the file!!!
trainandtest <- function (method, traindf, testdf, control) {
  if (method == "ranger" & length(colnames(traindf)) < 3) {
    model <- randomForest(
      Tier ~ .,
      data = traindf,
      mtry = 1
    )
  }
  else {
    model <- train(Tier ~ .,
                   data = traindf,
                   method = method,
                   trControl = control)
    testdf$pred <- predict(model, testdf) 
  }
  
  return(mean(testdf$Tier == testdf$pred))
}

featurecount <- 1
# Something is wrong here:
# MTRY For Ranger is auto failing with 1 feature. How do I set mtry low?
for (feature in featurecombs) {
  for (i in 1:10) {
    traindf <- leaguedf %>%
      filter(!(sample == i)) %>%
      select(all_of(feature), Tier)
    testdf <- leaguedf %>%
      filter(sample == i) %>%
      select(all_of(feature), Tier)
    
    #Train models, find the accuracy on the held out sample, and then record the data into the dataframe
    crossdf[((i-1) * 15) + featurecount, 4] <- trainandtest("ranger", traindf, testdf, fitControl)
    crossdf[((i-1) * 15) + featurecount, 2] <- trainandtest("svmLinear", traindf, testdf, fitControl)
    crossdf[((i-1) * 15) + featurecount, 3] <- trainandtest("svmRadial", traindf, testdf, fitControl)
    
    #Record the feature by putting 1 in the columns that have the feature present
    crossdf[((i-1) * 15) + featurecount, 5] <- "WinRate" %in% feature
    crossdf[((i-1) * 15) + featurecount, 6] <- "PickRate" %in% feature
    crossdf[((i-1) * 15) + featurecount, 7] <- "Role" %in% feature
    crossdf[((i-1) * 15) + featurecount, 8] <- "BanRate" %in% feature
    }
  featurecount <- featurecount + 1
  } 
## `summarise()` has grouped output by 'Model'. You can override using the
## `.groups` argument.
#TODO: Fix legends, axis, and axis labels
#Fix legends by adding colors and making manual labels
#Fix Axis labels by adding wrap_text to the feature in the top
#Fix axis by forcing them to all have the same 0 to 1 scale, so that htye look the same
p1 <- crossdf %>% filter(featurenum == 1) %>%
  na.omit() %>%
  ggplot() + 
  geom_bar(mapping = aes(x = Feature, y = Accuracy, fill = Model), stat = "identity", position = "dodge") + 
  theme_minimal() + 
    labs(x = "Features", y = "Accuracy", title = "1 Feature Models") + ylim(0, 1)
  
p2 <- crossdf %>% filter(featurenum == 2) %>%
  na.omit() %>%
  ggplot() + 
  geom_bar(mapping = aes(x = Feature, y = Accuracy, fill = Model), stat = "identity", position = "dodge") + 
  theme_minimal() + 
    labs(x = "Features", y = "Accuracy", title = "2 Feature Models") + ylim(0,1  )

p3 <- crossdf %>% filter(featurenum == 3) %>%
  na.omit() %>%
  ggplot() + 
  geom_bar(mapping = aes(x = Feature, y = Accuracy, fill = Model), stat = "identity", position = "dodge") + 
  theme_minimal() + 
    labs(x = "Features", y = "Accuracy", title = "3 Feature Models") + ylim(0,1 )

p4 <- crossdf %>% filter(featurenum == 4) %>%
  na.omit() %>%
  ggplot() + 
  geom_bar(mapping = aes(x = Feature, y = Accuracy, fill = Model), stat = "identity", position = "dodge") + 
  theme_minimal() + 
    labs(x = "Features", y = "Accuracy", title = "4 Feature Models") + ylim(0,1)

(p1 + p2) / (p3 + p4) + plot_layout(guides = "collect")

Ranger is best slightly with all 4 features, at 75.04% success. We will see if this is different statistically significantly from BanRate/PickRate

Top Accuracy Comparison

bestModels <- crossdf %>% na.omit() %>%
  group_by(Model) %>%
  slice_max(Accuracy, n=3) %>%
  mutate(Model = str_to_title(str_replace(Model, '_', ' ')),
         `Standard Deviation` = round(sd, digits = 4),
         WinRate = as.numeric(str_detect(Feature, "WinRate")),
         PickRate = as.numeric(str_detect(Feature, "PickRate")),
         BanRate = as.numeric(str_detect(Feature, "BanRate")),
         Role = as.numeric(str_detect(Feature, "Role")),
         Accuracy = round(Accuracy, digits = 4)) %>%
  select(-Feature, -featurenum, -sd) %>%
  ungroup()

bestModels %>% formattable(
  list(`WinRate` = formatter("span",
                        x ~ icontext(ifelse(x > 0, "ok", "remove"), ifelse(x > 0, "Yes", "No")),
                        style = x ~ style(color = ifelse(x < 1, "red", "green"))),
       `PickRate` = formatter("span",
                        x ~ icontext(ifelse(x > 0, "ok", "remove"), ifelse(x > 0, "Yes", "No")),
                        style = x ~ style(color = ifelse(x < 1, "red", "green"))),
       `BanRate` = formatter("span",
                        x ~ icontext(ifelse(x > 0, "ok", "remove"), ifelse(x > 0, "Yes", "No")),
                        style = x ~ style(color = ifelse(x < 1, "red", "green"))),
       `Role` = formatter("span",
                        x ~ icontext(ifelse(x > 0, "ok", "remove"), ifelse(x > 0, "Yes", "No")),
                        style = x ~ style(color = ifelse(x < 1, "red", "green"))),
       `Accuracy` = formatter("span", x ~ percent(x)))
)
Model Accuracy Standard Deviation WinRate PickRate BanRate Role
Ranger 75.04% 0.0161 Yes Yes Yes Yes
Ranger 72.17% 0.0154 Yes Yes No Yes
Ranger 66.84% 0.0127 Yes Yes Yes No
Svm Linear 73.57% 0.0168 Yes Yes Yes Yes
Svm Linear 72.61% 0.0134 Yes Yes No Yes
Svm Linear 65.07% 0.0208 Yes Yes Yes No
Svm Radial 74.22% 0.0175 Yes Yes Yes Yes
Svm Radial 73.60% 0.0153 Yes Yes No Yes
Svm Radial 66.40% 0.0209 Yes Yes Yes No

SVM Models are more human readable than a random forest with 5k trees, so we give preference to them.

bestModels %>% slice_max(Accuracy, n=6) %>%
  mutate(meanDiff = max(Accuracy) - Accuracy, z = meanDiff/(sqrt(`Standard Deviation`^2/10 + 0.0161^2/10)), pval = round(pnorm(q=z, lower.tail=F), 4)) %>%
  kbl() %>%
  kable_classic(full_width = F, html_font = "cambria")
Model Accuracy Standard Deviation WinRate PickRate BanRate Role meanDiff z pval
Ranger 0.7504 0.0161 1 1 1 1 0.0000 0.000000 0.5000
Svm Radial 0.7422 0.0175 1 1 1 1 0.0082 1.090468 0.1378
Svm Radial 0.7360 0.0153 1 1 0 1 0.0144 2.050251 0.0202
Svm Linear 0.7357 0.0168 1 1 1 1 0.0147 1.997736 0.0229
Svm Linear 0.7261 0.0134 1 1 0 1 0.0243 3.668494 0.0001
Ranger 0.7217 0.0154 1 1 0 1 0.0287 4.073607 0.0000

Hence, our Ranger model is NOT better that our Radial SVM model with all features within a p=0.05, however, all other models are. This is happy! Because we have projected our data set down by a lot and reduced the amount of variables we need and whatnot. WE can better visualize this now using

#Hypertuning parameters
#Algorithm is a greedy alg i came up with that goes as follows:
#1. Test with parameters, find the best fit, if it worst than the last then we are done, save the model
#2. Otherwise, we are not done. Take C and Sigma for the best fit of the last set of params, create an interval around them of [C - delta, C + delta], where both Delta are the distance (Cmax - Cmin)/5, and the same for Sigma
#3. If the new C interval < 0.1 in length, we are done because we have gone far enoguh. Save the model
#4. Repeat until complete.

#NOTE THAT THERE MAY BE A BUG WHERE IF WE OVERTRAIN THE MODEL (last was better than all current) WE DON'T GET THE ORIGINAL MODEL BACK!!!! POSSIBLY WANT TO FIX THIS BBG <3 But could always analyze and see cmax - cmin /2 for last C value
#Also likely has a memory leak since none of the models are locally defined. 

#improvement idea to fix both issues : Define train function that returns either previous best C and best sigma OR new best C and best sigma? Locally defined since function and if we can keep best C and sigma we can train a model on those specifically!

grid <- expand.grid(C = seq(0, 5, length = 5), sigma = seq(0, 1, length = 5))

cmin = 0
cmax = 5
sigmamin = 0
sigmamax = 0.3
done = F
last_acc = 0

fitControl <- trainControl(method='CV', 
                           number = 5, 
                           verboseIter=F)
while (done == F) {
  svmrad_fit <- train(Tier ~ .,
                   data = select(leaguedf, c(WinRate, PickRate, BanRate, Role, Tier)),
                   method="svmRadial",
                   trControl = fitControl,
                   tuneGrid = grid)
  best_fit <- svmrad_fit$results %>%
    slice_max(Accuracy, n=1) 
  print(best_fit)
  
  
  if (last_acc > best_fit$Accuracy[1]) {
    print("Best fit Higher then last Accuracy")
    done = T
  }
  else {
    if (best_fit$C[1] == cmax) {
      cmin = cmin + cmax
      cmax = cmax * 2
    }
    if (best_fit$sigma[1] == sigmamax) {
      sigmamin = sigmamin + sigmamax
      sigmamax = sigmamax *2
    }
    else {
      cmin <- svmrad_fit$results %>%
        filter(C < best_fit$C) %>%
        slice_max(C, n = 1)
      cmin <- cmin$C[1]
      
      cmax <- svmrad_fit$results %>%
        filter(C > best_fit$C) %>%
        slice_min(C, n = 1)
      cmax <- cmax$C[1]
      
      sigmamin = svmrad_fit$results %>%
        filter(sigma < best_fit$sigma) %>%
        slice_max(sigma, n = 1)
      sigmamin = sigmamin$sigma[1]
      
      sigmamax = svmrad_fit$results %>%
        filter(sigma > best_fit$sigma) %>%
        slice_min(sigma, n =1)
      sigmamax = sigmamax$sigma[1]
      
      if (abs(cmin - cmax) < 0.01) {
        print("Within 0.01")
        print(abs(cmin - cmax))
        done = T      }
      grid <- expand.grid(C = seq(cmin, cmax, length = 5), sigma = seq(sigmamin, sigmamax, length = 5))
      last_acc <- best_fit$Accuracy[1]
      print(paste("New cmin is: ", cmin, "New cmax is:", cmax,  sep = ' '))
      }
    }
}
svmrad_fit <- train(Tier ~ .,
                    data  = select(leaguedf, c(WinRate, PickRate, BanRate, Role, Tier)),
                    method= "svmRadial",
                    trControl = fitControl,
                    tuneGrid = grid)
saveRDS(svmrad_fit, "SVMRad_Fit.rds")
tiers = c("D", "C", "B", "A", "S", "God")
roles = c("Top","Jungle", "Mid", "Adc", "Support")

PlottingData <- leaguedf %>%
    select(c(WinRate, PickRate, BanRate, Role, Tier)) %>%
    mutate(Tier = fct_recode(Tier, 
      '1' = 'D',
      '2' = 'C',
      '3' = 'B',
      '4' = 'A',
      '5' = 'S',
      '6' = 'God'
    )) %>%
    mutate(Tier = as.numeric(as.character(Tier)))


open3d()
#TOP
Plotting <- PlottingData %>%
  filter(Role == "Top")

subtop <- subsceneInfo()$id
dataList <- MakeIsoSurface(Plotting, "Top")
contour.list <- dataList[1][[1]]
contour.df <- dataList[2][[1]]

#Make one surface for each plot and save.

Topplot <-MakeContourPlot(contour.df, contour.list, Plotting, "Top")

#Now take each surface and generate a plot, the same way a loop would, but save it to a unique variable


#Repeat the steps every single time.
#MID
Plotting <- PlottingData %>%
  filter(Role == "Mid")

submid <- subsceneInfo()$id
dataList <- MakeIsoSurface(Plotting, "Mid")
contour.list <- dataList[1][[1]]
contour.df <- dataList[2][[1]]

Midplot <- MakeContourPlot(contour.df, contour.list, Plotting, "Mid")

#ADC

Plotting <- PlottingData %>%
  filter(Role == "Adc")

subadc <- subsceneInfo()$id
dataList <- MakeIsoSurface(Plotting, "Adc")
contour.list <- dataList[1][[1]]
contour.df <- dataList[2][[1]]

Adcplot <- MakeContourPlot(contour.df, contour.list, Plotting, "Adc")

#SUPPORT

Plotting <- PlottingData %>%
  filter(Role == "Support")

subsupport <- subsceneInfo()$id
dataList <- MakeIsoSurface(Plotting, "Support")
contour.list <- dataList[1][[1]]
contour.df <- dataList[2][[1]]

Supportplot <- MakeContourPlot(contour.df, contour.list, Plotting, "Support")


#JUNGLE
Plotting <- PlottingData %>%
  filter(Role == "Jungle")

subjungle <- subsceneInfo()$id
dataList <- MakeIsoSurface(Plotting, "Jungle")
contour.list <- dataList[1][[1]]
contour.df <- dataList[2][[1]]

Jungleplot <- MakeContourPlot(contour.df, contour.list, Plotting, "Jungle")

close3d()
#There is behind the scenes work here with Flexdashboard used to create our widget below. If you want to see it, go to https://github.com/DhAiBt/STOR-320-Group-13 and look at FinalPaper/ContourPlot.nb :-)

TempChamps Analysis

#Special Analysis just for TempChamps
#TODO: Fix table format so it isn't disgusting
leaguedf$Pred_svmrad = predict(svmrad_fit, leaguedf)
pretable <- leaguedf %>%
  group_by(Name, Role) %>%
  mutate(n = n()) %>%
  ungroup()  %>%
  mutate(Temp = n < 23, correct = Tier == Pred_svmrad) %>%
  group_by(Temp) %>%
  summarize(Acc = mean(correct), n = n(), sd = sd(correct))

#Assume that the true probablility is correct. Then, we have H0 as Temp >= NonTemp, and H1 as Temp < NonTemp.

table <- tibble(
  NonTemporary = pretable[[1,2]], Temporary = pretable[[2,2]], NonTemporaryN = 4876, TemporaryN = 761, sdTemporary = pretable[[1,4]], sdNonTemporary = pretable[[2,4]]
) %>%
  mutate(diff  = NonTemporary - Temporary, z = diff/ (sqrt(sdTemporary^2 / TemporaryN + sdNonTemporary^2 / NonTemporaryN)), p = pnorm(q = z, lower.tail = F)) %>%
  kbl() %>%
  kable_classic()

pretable %>%
  kbl() %>%
  kable_classic
Temp Acc n sd
FALSE 0.7807629 4876 0.4137720
TRUE 0.7095926 761 0.4542489
table
NonTemporary Temporary NonTemporaryN TemporaryN sdTemporary sdNonTemporary diff z p
0.7807629 0.7095926 4876 761 0.413772 0.4542489 0.0711703 4.353149 6.7e-06

CONCLUSION

IN LESS THAN 4 PARAGRAPHS, YOU SHOULD RESTATE YOUR QUESTIONS ALONG WITH YOUR CONCLUSIONS. THE PURPOSE OF THIS SECTION IS TO SUMMARIZE YOUR FINDINGS (SHORT), DEFEND THE IMPORTANCE OF YOUR RESULTS IN THE REAL WORLD (LONG), AND PROVIDE A ROADMAP FOR OTHERS TO CONTINUE THIS WORK (LONG). ARE YOUR CONCLUSIONS WHAT YOU EXPECTED OR UNUSUAL? WHY SHOULD SOMEONE CARE ABOUT THESE RESULTS? HOW COULD THESE RESULTS BE USED IN THE REAL WORLD? YOU SHOULD PROVIDE IDEAS ABOUT FUTURE DIRECTIONS ON WHERE YOUR MODELING COULD POSSIBLY BE IMPROVED. ARE THERE ANY METHODS YOU DIDN’T USE THAT MAY WORK BETTER? IS THERE DATA YOU DIDN’T HAVE ACCESS TO THAT MAY BE USEFUL IN THIS DATA ANALYSIS?

Mode <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}

Boundarydf = expand.grid(WinRate = seq(0.4, 0.6, length.out=30),
                         PickRate = seq(0.0, 0.5, length.out= 20), 
                         BanRate = seq(0.0, 0.5, length.out = 20), 
                         RoleRate = seq(0.0, 1, length.out=20),
                         KDA = seq(1, 3, length.out = 30),
                         Role = "Top", 
                         ID = c("Camille_Fighter"), 
                         Patch = 13) %>%
  mutate(Name = str_replace(ID, "_.*", ''),
         Class = str_replace(ID, ".*_", ''))
Boundarydf$tier <- predict(rf_fit, Boundarydf)
Boundarydf <- Boundarydf %>%
  group_by(WinRate, KDA) %>%
  summarize(Tier = Mode(tier)) %>%
  ungroup()

write.csv(Boundarydf, "BoundaryFrame.csv")
#Visualization #1 for Ranger Model
acc_plot <- rf_fit$results %>%  
  mutate(mtry = as.character(mtry), min.node.size = as.character(min.node.size)) %>%
  mutate(mtry = factor(mtry, levels=c("6", "9", "12", "15"), ordered=TRUE),
         min.node.size = factor(min.node.size, levels = c("1","3", "6", "10"), ordered=TRUE)) %>%
  ggplot() + geom_tile(mapping = aes(x=mtry, y=min.node.size, fill=Accuracy)) + facet_wrap(splitrule ~.) + ggtitle("Accuracy for each value") + theme(legend.position = "bottom")

rf_pred <- predict(rf_fit, leagueTest)

cm <- confusionMatrix(rf_pred, leagueTest$Tier, dnn = c("Prediction", "Actual"))
plt <- as.data.frame(cm$table) %>%
  group_by(Actual) %>%
  mutate(Percent = Freq / sum(Freq))

plt$Prediction <- factor(plt$Prediction, levels=rev(levels(plt$Prediction)))
cm_plot <- ggplot(plt, aes(Prediction,Actual, fill=Percent)) +
        geom_tile() + geom_text(aes(label=Freq)) +
        scale_fill_gradient(low="white", high="#009194") +
        labs(x = "Actual",y = "Prediction", title = "Confusion Matrix") +
        scale_y_discrete(labels=c('D', 'C', 'B', 'A', 'S', 'God')) +
        scale_x_discrete(labels=c('God', 'S', 'A', 'B', 'C', 'D')) + theme_minimal() + theme(legend.position = "bottom")

boundary_plot <- leaguedf %>%
  filter(Role == 'Top'& Patch == 23 & (Class == "Fighter"| Class == "Tank"))  %>%
  ggplot() + geom_point(mapping = aes(x = KDA, y = WinRate, color = Tier)) + geom_raster(data = Boundarydf, mappign = aes(x = KDA, y = WinRate, fill = Tier))
  
(acc_plot + cm_plot) /boundary_plot